UCI Adult Data Set

Dataset URL: https://archive.ics.uci.edu/ml/datasets/adult

Predict whether income exceeds $50K/yr based on census data. Also known as "Census Income" dataset.


In [1]:
import shutil
import math
from datetime import datetime
import multiprocessing

import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow import data
from tensorflow.python.feature_column import feature_column

print(tf.__version__)


/Users/khalidsalama/anaconda/lib/python3.6/importlib/_bootstrap.py:205: RuntimeWarning: compiletime version 3.5 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.6
  return f(*args, **kwds)
1.4.1

In [2]:
MODEL_NAME = 'cenus-model-02'

TRAIN_DATA_FILES_PATTERN = 'data/adult.data.csv'
TEST_DATA_FILES_PATTERN = 'data/adult.test.csv'

RESUME_TRAINING = False
PROCESS_FEATURES = True
EXTEND_FEATURE_COLUMNS = True
MULTI_THREADING = True

Define Dataset Metadata


In [3]:
HEADER = ['age', 'workclass', 'fnlwgt', 'education', 'education_num',
               'marital_status', 'occupation', 'relationship', 'race', 'gender',
               'capital_gain', 'capital_loss', 'hours_per_week',
               'native_country', 'income_bracket']

HEADER_DEFAULTS = [[0], [''], [0], [''], [0], [''], [''], [''], [''], [''],
                       [0], [0], [0], [''], ['']]

NUMERIC_FEATURE_NAMES = ['age', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']

CATEGORICAL_FEATURE_NAMES_WITH_VOCABULARY = {
    'gender': ['Female', 'Male'],
    
    'race': ['Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black', 'Other', 'White'],
    
    'education': ['Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college', 
                  'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school', 
                  '5th-6th', '10th', '1st-4th', 'Preschool', '12th'],
    
    'marital_status': ['Married-civ-spouse', 'Divorced', 'Married-spouse-absent', 
                       'Never-married', 'Separated', 'Married-AF-spouse', 'Widowed'],
    
    'relationship': ['Husband', 'Not-in-family', 'Wife', 'Own-child', 'Unmarried', 'Other-relative'],
    
    'workclass': ['Self-emp-not-inc', 'Private', 'State-gov', 'Federal-gov', 'Local-gov', '?', 
                  'Self-emp-inc', 'Without-pay', 'Never-worked']
}

CATEGORICAL_FEATURE_NAMES_WITH_BUCKET_SIZE = {
    'occupation': 50,
    'native_country' : 100
}

CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURE_NAMES_WITH_VOCABULARY.keys()) + list(CATEGORICAL_FEATURE_NAMES_WITH_BUCKET_SIZE.keys())

FEATURE_NAMES = NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES

TARGET_NAME = 'income_bracket'

TARGET_LABELS = ['<=50K', '>50K']

WEIGHT_COLUMN_NAME = 'fnlwgt'

UNUSED_FEATURE_NAMES = list(set(HEADER) - set(FEATURE_NAMES) - {TARGET_NAME} - {WEIGHT_COLUMN_NAME})


print("Header: {}".format(HEADER))
print("Numeric Features: {}".format(NUMERIC_FEATURE_NAMES))
print("Categorical Features: {}".format(CATEGORICAL_FEATURE_NAMES))
print("Target: {} - labels: {}".format(TARGET_NAME, TARGET_LABELS))
print("Unused Features: {}".format(UNUSED_FEATURE_NAMES))


Header: ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'gender', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income_bracket']
Numeric Features: ['age', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']
Categorical Features: ['gender', 'race', 'education', 'marital_status', 'relationship', 'workclass', 'occupation', 'native_country']
Target: income_bracket - labels: ['<=50K', '>50K']
Unused Features: []

Load and Analyse Dataset


In [4]:
TRAIN_DATA_SIZE = 32561
TEST_DATA_SIZE = 16278

train_data = pd.read_csv(TRAIN_DATA_FILES_PATTERN, header=None, names=HEADER )
train_data.head(10)


Out[4]:
age workclass fnlwgt education education_num marital_status occupation relationship race gender capital_gain capital_loss hours_per_week native_country income_bracket
0 39 State-gov 77516 Bachelors 13 Never-married Adm-clerical Not-in-family White Male 2174 0 40 United-States <=50K
1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0 0 13 United-States <=50K
2 38 Private 215646 HS-grad 9 Divorced Handlers-cleaners Not-in-family White Male 0 0 40 United-States <=50K
3 53 Private 234721 11th 7 Married-civ-spouse Handlers-cleaners Husband Black Male 0 0 40 United-States <=50K
4 28 Private 338409 Bachelors 13 Married-civ-spouse Prof-specialty Wife Black Female 0 0 40 Cuba <=50K
5 37 Private 284582 Masters 14 Married-civ-spouse Exec-managerial Wife White Female 0 0 40 United-States <=50K
6 49 Private 160187 9th 5 Married-spouse-absent Other-service Not-in-family Black Female 0 0 16 Jamaica <=50K
7 52 Self-emp-not-inc 209642 HS-grad 9 Married-civ-spouse Exec-managerial Husband White Male 0 0 45 United-States >50K
8 31 Private 45781 Masters 14 Never-married Prof-specialty Not-in-family White Female 14084 0 50 United-States >50K
9 42 Private 159449 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 5178 0 40 United-States >50K

In [5]:
train_data.describe()


Out[5]:
age fnlwgt education_num capital_gain capital_loss hours_per_week
count 32561.000000 3.256100e+04 32561.000000 32561.000000 32561.000000 32561.000000
mean 38.581647 1.897784e+05 10.080679 1077.648844 87.303830 40.437456
std 13.640433 1.055500e+05 2.572720 7385.292085 402.960219 12.347429
min 17.000000 1.228500e+04 1.000000 0.000000 0.000000 1.000000
25% 28.000000 1.178270e+05 9.000000 0.000000 0.000000 40.000000
50% 37.000000 1.783560e+05 10.000000 0.000000 0.000000 40.000000
75% 48.000000 2.370510e+05 12.000000 0.000000 0.000000 45.000000
max 90.000000 1.484705e+06 16.000000 99999.000000 4356.000000 99.000000

Compute Scaling Statistics for Numeric Columns


In [6]:
means = train_data[NUMERIC_FEATURE_NAMES].mean(axis=0)
stdvs = train_data[NUMERIC_FEATURE_NAMES].std(axis=0)
maxs = train_data[NUMERIC_FEATURE_NAMES].max(axis=0)
mins = train_data[NUMERIC_FEATURE_NAMES].min(axis=0)
df_stats = pd.DataFrame({"mean":means, "stdv":stdvs, "max":maxs, "min":mins})
df_stats.head(15)


Out[6]:
max mean min stdv
age 90 38.581647 17 13.640433
education_num 16 10.080679 1 2.572720
capital_gain 99999 1077.648844 0 7385.292085
capital_loss 4356 87.303830 0 402.960219
hours_per_week 99 40.437456 1 12.347429

Save Scaling Statistics


In [7]:
df_stats.to_csv(path_or_buf="data/adult.stats.csv", header=True, index=True)

Define Data Input Function

a. Parsing and preprocessing logic


In [8]:
def parse_csv_row(csv_row):
    
    columns = tf.decode_csv(csv_row, record_defaults=HEADER_DEFAULTS)
    features = dict(zip(HEADER, columns))
    
    for column in UNUSED_FEATURE_NAMES:
        features.pop(column)
    
    target = features.pop(TARGET_NAME)

    return features, target

def process_features(features):

    capital_indicator = features['capital_gain'] > features['capital_loss']
    features['capital_indicator'] = tf.cast(capital_indicator, dtype=tf.int32)
    
    return features

b. Data pipeline input function


In [9]:
def parse_label_column(label_string_tensor):
    table = tf.contrib.lookup.index_table_from_tensor(tf.constant(TARGET_LABELS))
    return table.lookup(label_string_tensor)

In [10]:
def csv_input_fn(files_name_pattern, mode=tf.estimator.ModeKeys.EVAL, 
                 skip_header_lines=0, 
                 num_epochs=None, 
                 batch_size=200):
    
    shuffle = True if mode == tf.estimator.ModeKeys.TRAIN else False
        
    num_threads = multiprocessing.cpu_count() if MULTI_THREADING else 1
     
    print("")
    print("* data input_fn:")
    print("================")
    print("Input file(s): {}".format(files_name_pattern))
    print("Batch size: {}".format(batch_size))
    print("Epoch Count: {}".format(num_epochs))
    print("Mode: {}".format(mode))
    print("Thread Count: {}".format(num_threads))
    print("Shuffle: {}".format(shuffle))
    print("================")
    print("")

    file_names = tf.matching_files(files_name_pattern)
    dataset = data.TextLineDataset(filenames=file_names)
    
    dataset = dataset.skip(skip_header_lines)
    
    if shuffle:
        dataset = dataset.shuffle(buffer_size=2 * batch_size + 1)
    
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(lambda csv_row: parse_csv_row(csv_row), 
                          num_parallel_calls=num_threads)
    
    if PROCESS_FEATURES:
        dataset = dataset.map(lambda features, target: (process_features(features), target), 
                              num_parallel_calls=num_threads)
        
    dataset = dataset.repeat(num_epochs)
    iterator = dataset.make_one_shot_iterator()
    
    features, target = iterator.get_next()
    return features, parse_label_column(target)

In [11]:
features, target = csv_input_fn(files_name_pattern="")
print("Features in CSV: {}".format(list(features.keys())))
print("Target in CSV: {}".format(target))


* data input_fn:
================
Input file(s): 
Batch size: 200
Epoch Count: None
Mode: eval
Thread Count: 4
Shuffle: False
================

Features in CSV: ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'gender', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'capital_indicator']
Target in CSV: Tensor("hash_table_Lookup:0", shape=(?,), dtype=int64)

Define Feature Columns

a. Load scaling params


In [12]:
df_stats = pd.read_csv("data/adult.stats.csv", header=0, index_col=0)
df_stats['feature_name'] = NUMERIC_FEATURE_NAMES
df_stats.head(10)


Out[12]:
max mean min stdv feature_name
age 90 38.581647 17 13.640433 age
education_num 16 10.080679 1 2.572720 education_num
capital_gain 99999 1077.648844 0 7385.292085 capital_gain
capital_loss 4356 87.303830 0 402.960219 capital_loss
hours_per_week 99 40.437456 1 12.347429 hours_per_week

b. Create feature columns


In [13]:
def extend_feature_columns(feature_columns, hparams):
    
    age_buckets = tf.feature_column.bucketized_column(
      feature_columns['age'], boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
    
    education_X_occupation = tf.feature_column.crossed_column(
     ['education', 'occupation'], hash_bucket_size=int(1e4))
    
    age_buckets_X_race = tf.feature_column.crossed_column(
     [age_buckets, feature_columns['race']], hash_bucket_size=int(1e4))
    
    native_country_X_occupation = tf.feature_column.crossed_column(
          ['native_country', 'occupation'], hash_bucket_size=int(1e4))
    
    native_country_embedded = tf.feature_column.embedding_column(
          feature_columns['native_country'], dimension=hparams.embedding_size)
    
    occupation_embedded = tf.feature_column.embedding_column(
          feature_columns['occupation'], dimension=hparams.embedding_size)
    
    education_X_occupation_embedded = tf.feature_column.embedding_column(
          education_X_occupation, dimension=hparams.embedding_size)
    
    native_country_X_occupation_embedded = tf.feature_column.embedding_column(
          native_country_X_occupation, dimension=hparams.embedding_size)
    
    
    feature_columns['age_buckets'] = age_buckets
    feature_columns['education_X_occupation'] = education_X_occupation
    feature_columns['age_buckets_X_race'] = age_buckets_X_race
    feature_columns['native_country_X_occupation'] = native_country_X_occupation
    feature_columns['native_country_embedded'] = native_country_embedded
    feature_columns['occupation_embedded'] = occupation_embedded
    feature_columns['education_X_occupation_embedded'] = education_X_occupation_embedded
    feature_columns['native_country_X_occupation_embedded'] = native_country_X_occupation_embedded
    
    return feature_columns

def standard_scaler(x, mean, stdv):
    return (x-mean)/(stdv)

def maxmin_scaler(x, max_value, min_value):
    return (x-min_value)/(max_value-min_value)  

def get_feature_columns(hparams):
    
    numeric_columns = {}
    
    for feature_name in NUMERIC_FEATURE_NAMES:

        feature_mean = df_stats[df_stats.feature_name == feature_name]['mean'].values[0]
        feature_stdv = df_stats[df_stats.feature_name == feature_name]['stdv'].values[0]
        normalizer_fn = lambda x: standard_scaler(x, feature_mean, feature_stdv)
        
        numeric_columns[feature_name] = tf.feature_column.numeric_column(feature_name, 
                                                                         normalizer_fn=normalizer_fn
                                                                        )
    CONSTRUCTED_NUMERIC_FEATURES_NAMES = []
    
    if PROCESS_FEATURES:
        for feature_name in CONSTRUCTED_NUMERIC_FEATURES_NAMES:
            numeric_columns[feature_name] = tf.feature_column.numeric_column(feature_name)
    
    categorical_column_with_vocabulary = \
        {item[0]: tf.feature_column.categorical_column_with_vocabulary_list(item[0], item[1])
         for item in CATEGORICAL_FEATURE_NAMES_WITH_VOCABULARY.items()}
        
    CONSTRUCTED_INDICATOR_FEATURES_NAMES = ['capital_indicator']
    
    categorical_column_with_identity = {}
    
    for feature_name in CONSTRUCTED_INDICATOR_FEATURES_NAMES: 
        categorical_column_with_identity[feature_name] = tf.feature_column.categorical_column_with_identity(feature_name, 
                                                                                                              num_buckets=2,
                                                                                                              default_value=0)
    categorical_column_with_hash_bucket = \
        {item[0]: tf.feature_column.categorical_column_with_hash_bucket(item[0], item[1], dtype=tf.string)
         for item in CATEGORICAL_FEATURE_NAMES_WITH_BUCKET_SIZE.items()}
        
    feature_columns = {}

    if numeric_columns is not None:
        feature_columns.update(numeric_columns)

    if categorical_column_with_vocabulary is not None:
        feature_columns.update(categorical_column_with_vocabulary)
        
    if categorical_column_with_identity is not None:
        feature_columns.update(categorical_column_with_identity)
        
    if categorical_column_with_hash_bucket is not None:
        feature_columns.update(categorical_column_with_hash_bucket)
    
    if EXTEND_FEATURE_COLUMNS:
        feature_columns = extend_feature_columns(feature_columns, hparams)
        
    return feature_columns

feature_columns = get_feature_columns(tf.contrib.training.HParams(num_buckets=5,embedding_size=3))
print("Feature Columns: {}".format(feature_columns))


Feature Columns: {'age': _NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function get_feature_columns.<locals>.<lambda> at 0x125ea0d90>), 'education_num': _NumericColumn(key='education_num', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function get_feature_columns.<locals>.<lambda> at 0x125ea0a60>), 'capital_gain': _NumericColumn(key='capital_gain', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function get_feature_columns.<locals>.<lambda> at 0x125ea0ea0>), 'capital_loss': _NumericColumn(key='capital_loss', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function get_feature_columns.<locals>.<lambda> at 0x125ea0f28>), 'hours_per_week': _NumericColumn(key='hours_per_week', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function get_feature_columns.<locals>.<lambda> at 0x125ea07b8>), 'gender': _VocabularyListCategoricalColumn(key='gender', vocabulary_list=('Female', 'Male'), dtype=tf.string, default_value=-1, num_oov_buckets=0), 'race': _VocabularyListCategoricalColumn(key='race', vocabulary_list=('Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black', 'Other', 'White'), dtype=tf.string, default_value=-1, num_oov_buckets=0), 'education': _VocabularyListCategoricalColumn(key='education', vocabulary_list=('Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college', 'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school', '5th-6th', '10th', '1st-4th', 'Preschool', '12th'), dtype=tf.string, default_value=-1, num_oov_buckets=0), 'marital_status': _VocabularyListCategoricalColumn(key='marital_status', vocabulary_list=('Married-civ-spouse', 'Divorced', 'Married-spouse-absent', 'Never-married', 'Separated', 'Married-AF-spouse', 'Widowed'), dtype=tf.string, default_value=-1, num_oov_buckets=0), 'relationship': _VocabularyListCategoricalColumn(key='relationship', vocabulary_list=('Husband', 'Not-in-family', 'Wife', 'Own-child', 'Unmarried', 'Other-relative'), dtype=tf.string, default_value=-1, num_oov_buckets=0), 'workclass': _VocabularyListCategoricalColumn(key='workclass', vocabulary_list=('Self-emp-not-inc', 'Private', 'State-gov', 'Federal-gov', 'Local-gov', '?', 'Self-emp-inc', 'Without-pay', 'Never-worked'), dtype=tf.string, default_value=-1, num_oov_buckets=0), 'capital_indicator': _IdentityCategoricalColumn(key='capital_indicator', num_buckets=2, default_value=0), 'occupation': _HashedCategoricalColumn(key='occupation', hash_bucket_size=50, dtype=tf.string), 'native_country': _HashedCategoricalColumn(key='native_country', hash_bucket_size=100, dtype=tf.string), 'age_buckets': _BucketizedColumn(source_column=_NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function get_feature_columns.<locals>.<lambda> at 0x125ea0d90>), boundaries=(18, 25, 30, 35, 40, 45, 50, 55, 60, 65)), 'education_X_occupation': _CrossedColumn(keys=('education', 'occupation'), hash_bucket_size=10000, hash_key=None), 'age_buckets_X_race': _CrossedColumn(keys=(_BucketizedColumn(source_column=_NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function get_feature_columns.<locals>.<lambda> at 0x125ea0d90>), boundaries=(18, 25, 30, 35, 40, 45, 50, 55, 60, 65)), _VocabularyListCategoricalColumn(key='race', vocabulary_list=('Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black', 'Other', 'White'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), hash_bucket_size=10000, hash_key=None), 'native_country_X_occupation': _CrossedColumn(keys=('native_country', 'occupation'), hash_bucket_size=10000, hash_key=None), 'native_country_embedded': _EmbeddingColumn(categorical_column=_HashedCategoricalColumn(key='native_country', hash_bucket_size=100, dtype=tf.string), dimension=3, combiner='mean', initializer=<tensorflow.python.ops.init_ops.TruncatedNormal object at 0x125eef358>, ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None, trainable=True), 'occupation_embedded': _EmbeddingColumn(categorical_column=_HashedCategoricalColumn(key='occupation', hash_bucket_size=50, dtype=tf.string), dimension=3, combiner='mean', initializer=<tensorflow.python.ops.init_ops.TruncatedNormal object at 0x125eef390>, ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None, trainable=True), 'education_X_occupation_embedded': _EmbeddingColumn(categorical_column=_CrossedColumn(keys=('education', 'occupation'), hash_bucket_size=10000, hash_key=None), dimension=3, combiner='mean', initializer=<tensorflow.python.ops.init_ops.TruncatedNormal object at 0x125eef3c8>, ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None, trainable=True), 'native_country_X_occupation_embedded': _EmbeddingColumn(categorical_column=_CrossedColumn(keys=('native_country', 'occupation'), hash_bucket_size=10000, hash_key=None), dimension=3, combiner='mean', initializer=<tensorflow.python.ops.init_ops.TruncatedNormal object at 0x125eef400>, ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None, trainable=True)}

Define a Model Function for the Custom Estimator

a. Prepare input layer feature columns


In [14]:
def get_input_layer_feature_columns(hparams):
    
    feature_columns = list(get_feature_columns(hparams).values())
    
    dense_columns = list(
        filter(lambda column: isinstance(column, feature_column._NumericColumn) |
                              isinstance(column, feature_column._EmbeddingColumn),
               feature_columns
        )
    )

    categorical_columns = list(
        filter(lambda column: isinstance(column, feature_column._VocabularyListCategoricalColumn) |
                              isinstance(column, feature_column._BucketizedColumn),
                   feature_columns)
    )
    

    indicator_columns = list(
            map(lambda column: tf.feature_column.indicator_column(column),
                categorical_columns)
    )
    
    return dense_columns + indicator_columns

b. Define model_fn + exponential decay learning rate


In [15]:
def model_fn(features, labels, mode, params):

    hidden_units = params.hidden_units
    output_layer_size = len(TARGET_LABELS)

    feature_columns = get_input_layer_feature_columns(hparams)

    # Create the input layers from the feature columns
    input_layer = tf.feature_column.input_layer(features=features, 
                                                feature_columns=feature_columns)


    # Create a fully-connected layer-stack based on the hidden_units in the params
    hidden_layers = tf.contrib.layers.stack(inputs= input_layer,
                                            layer= tf.contrib.layers.fully_connected,
                                            stack_args= hidden_units)

    # Connect the output layer (logits) to the hidden layer (no activation fn)
    logits = tf.layers.dense(inputs=hidden_layers, 
                             units=output_layer_size)

    # Reshape output layer to 1-dim Tensor to return predictions
    output = tf.squeeze(logits)

    # Provide an estimator spec for `ModeKeys.PREDICT`.
    if mode == tf.estimator.ModeKeys.PREDICT:
        probabilities = tf.nn.softmax(logits)
        predicted_indices = tf.argmax(probabilities, 1)

        # Convert predicted_indices back into strings
        predictions = {
            'class': tf.gather(TARGET_LABELS, predicted_indices),
            'probabilities': probabilities
        }
        export_outputs = {
            'prediction': tf.estimator.export.PredictOutput(predictions)
        }
        
        # Provide an estimator spec for `ModeKeys.PREDICT` modes.
        return tf.estimator.EstimatorSpec(mode,
                                          predictions=predictions,
                                          export_outputs=export_outputs)

    
    weights = features[WEIGHT_COLUMN_NAME]
    
    # Calculate loss using softmax cross entropy
    loss = tf.losses.sparse_softmax_cross_entropy(
        logits=logits, 
        labels=labels,
        weights=weights
    )
    
    
    tf.summary.scalar('loss', loss)
    
    
    if mode == tf.estimator.ModeKeys.TRAIN:
        
        
        
        # Learning rate scheduler using exponential decay
        initial_learning_rate = params.learning_rate
        decay_steps = params.num_epochs
        decay_rate = 0.1  # if set to 1, then no decay. Set to smaller value to reach while decaying
        
        global_step = tf.train.get_global_step()
        
        # decayed_learning_rate = learning_rate * decay_rate ^ (global_step / decay_steps)
        learning_rate = tf.train.exponential_decay(initial_learning_rate, global_step,
                                                       decay_steps, decay_rate)

        # Create Optimiser
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)

        # Create training operation
        train_op = optimizer.minimize(
            loss=loss, global_step=global_step)

        # Provide an estimator spec for `ModeKeys.TRAIN` modes.
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss, 
                                          train_op=train_op)
        


    if mode == tf.estimator.ModeKeys.EVAL:
        probabilities = tf.nn.softmax(logits)
        predicted_indices = tf.argmax(probabilities, 1)

        # Return accuracy and area under ROC curve metrics
        labels_one_hot = tf.one_hot(
            labels,
            depth=len(TARGET_LABELS),
            on_value=True,
            off_value=False,
            dtype=tf.bool
        )
        
        eval_metric_ops = {
            'accuracy': tf.metrics.accuracy(labels, predicted_indices),
            'auroc': tf.metrics.auc(labels_one_hot, probabilities)
        }
        
        # Provide an estimator spec for `ModeKeys.EVAL` modes.
        return tf.estimator.EstimatorSpec(mode, 
                                          loss=loss, 
                                          eval_metric_ops=eval_metric_ops)



def create_estimator(run_config, hparams):
    estimator = tf.estimator.Estimator(model_fn=classification_model_fn, 
                                  params=hparams, 
                                  config=run_config)
    
    print("")
    print("Estimator Type: {}".format(type(estimator)))
    print("")

    return estimator

b. Create the custom estimator


In [16]:
def create_custom_estimator(run_config, hparams):
    
    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       params=hparams,
                                       config= run_config
                                      )
    return estimator

6. Run Experiment

a. Set HParam and RunConfig


In [17]:
TRAIN_SIZE = TRAIN_DATA_SIZE
NUM_EPOCHS = 100
BATCH_SIZE = 500
EVAL_AFTER_SEC = 60
TOTAL_STEPS = (TRAIN_SIZE/BATCH_SIZE)*NUM_EPOCHS

hparams  = tf.contrib.training.HParams(
    num_epochs = NUM_EPOCHS,
    batch_size = BATCH_SIZE,
    embedding_size = 4,
    hidden_units= [64, 32, 16],
    max_steps = TOTAL_STEPS,
    learning_rate = 0.5
)

model_dir = 'trained_models/{}'.format(MODEL_NAME)

run_config = tf.estimator.RunConfig(
    log_step_count_steps=5000,
    tf_random_seed=19830610,
    model_dir=model_dir
)

print(hparams)
print("Model Directory:", run_config.model_dir)
print("")
print("Dataset Size:", TRAIN_SIZE)
print("Batch Size:", BATCH_SIZE)
print("Steps per Epoch:",TRAIN_SIZE/BATCH_SIZE)
print("Total Steps:", TOTAL_STEPS)
print("That is 1 evaluation step after each",EVAL_AFTER_SEC," training seconds")


[('batch_size', 500), ('embedding_size', 4), ('hidden_units', [64, 32, 16]), ('learning_rate', 0.5), ('max_steps', 6512.2), ('num_epochs', 100)]
Model Directory: trained_models/cenus-model-02

Dataset Size: 32561
Batch Size: 500
Steps per Epoch: 65.122
Total Steps: 6512.2
That is 1 evaluation step after each 60  training seconds

b. Define a JSON serving function


In [18]:
def json_serving_input_fn():
    
    receiver_tensor = {}

    for feature_name in FEATURE_NAMES:
        dtype = tf.float32 if feature_name in NUMERIC_FEATURE_NAMES else tf.string
        receiver_tensor[feature_name] = tf.placeholder(shape=[None], dtype=dtype)

    if PROCESS_FEATURES:
        features = process_features(receiver_tensor)

    return tf.estimator.export.ServingInputReceiver(
        features, receiver_tensor)

b. Define TrainSpec and EvaluSpec


In [19]:
train_spec = tf.estimator.TrainSpec(
    input_fn = lambda: csv_input_fn(
        TRAIN_DATA_FILES_PATTERN,
        mode = tf.estimator.ModeKeys.TRAIN,
        num_epochs=hparams.num_epochs,
        batch_size=hparams.batch_size
    ),
    max_steps=hparams.max_steps,
    hooks=None
)

eval_spec = tf.estimator.EvalSpec(
    input_fn = lambda: csv_input_fn(
        TRAIN_DATA_FILES_PATTERN,
        mode=tf.estimator.ModeKeys.EVAL,
        num_epochs=1,
        batch_size=hparams.batch_size,
            
    ),
    exporters=[tf.estimator.LatestExporter(
        name="predict", # the name of the folder in which the model will be exported to under export
        serving_input_receiver_fn=json_serving_input_fn,
        exports_to_keep=1,
        as_text=False)],
    throttle_secs = EVAL_AFTER_SEC,
    steps=None
)

c. Run Experiment via train_and_evaluate


In [20]:
if not RESUME_TRAINING:
    print("Removing previous artifacts...")
    shutil.rmtree(model_dir, ignore_errors=True)
else:
    print("Resuming training...") 

    
tf.logging.set_verbosity(tf.logging.INFO)

time_start = datetime.utcnow() 
print("Experiment started at {}".format(time_start.strftime("%H:%M:%S")))
print(".......................................") 

estimator = create_custom_estimator(run_config, hparams)

tf.estimator.train_and_evaluate(
    estimator=estimator,
    train_spec=train_spec, 
    eval_spec=eval_spec
)

time_end = datetime.utcnow() 
print(".......................................")
print("Experiment finished at {}".format(time_end.strftime("%H:%M:%S")))
print("")
time_elapsed = time_end - time_start
print("Experiment elapsed time: {} seconds".format(time_elapsed.total_seconds()))


Removing previous artifacts...
Experiment started at 18:12:11
.......................................
INFO:tensorflow:Using config: {'_model_dir': 'trained_models/cenus-model-02', '_tf_random_seed': 19830610, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 5000, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x125edce80>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after 60 secs (eval_spec.throttle_secs) or training is finished.

* data input_fn:
================
Input file(s): data/adult.data.csv
Batch size: 500
Epoch Count: 100
Mode: train
Thread Count: 4
Shuffle: True
================

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into trained_models/cenus-model-02/model.ckpt.
INFO:tensorflow:loss = 1.30918e+06, step = 1
INFO:tensorflow:loss = 97091.8, step = 101 (1.007 sec)
INFO:tensorflow:loss = 90555.1, step = 201 (0.737 sec)
INFO:tensorflow:loss = 89721.9, step = 301 (0.706 sec)
INFO:tensorflow:loss = 96130.4, step = 401 (0.754 sec)
INFO:tensorflow:loss = 81176.2, step = 501 (0.725 sec)
INFO:tensorflow:loss = 85757.6, step = 601 (0.808 sec)
INFO:tensorflow:loss = 88907.4, step = 701 (0.771 sec)
INFO:tensorflow:loss = 87431.0, step = 801 (0.767 sec)
INFO:tensorflow:loss = 78291.1, step = 901 (0.703 sec)
INFO:tensorflow:loss = 90324.9, step = 1001 (0.697 sec)
INFO:tensorflow:loss = 85454.1, step = 1101 (0.714 sec)
INFO:tensorflow:loss = 90814.7, step = 1201 (0.709 sec)
INFO:tensorflow:loss = 90332.6, step = 1301 (0.694 sec)
INFO:tensorflow:loss = 81174.5, step = 1401 (0.707 sec)
INFO:tensorflow:loss = 85404.7, step = 1501 (0.788 sec)
INFO:tensorflow:loss = 87166.6, step = 1601 (0.773 sec)
INFO:tensorflow:loss = 94913.2, step = 1701 (0.756 sec)
INFO:tensorflow:loss = 87709.0, step = 1801 (0.674 sec)
INFO:tensorflow:loss = 92938.2, step = 1901 (0.630 sec)
INFO:tensorflow:loss = 97875.6, step = 2001 (0.661 sec)
INFO:tensorflow:loss = 86705.6, step = 2101 (0.738 sec)
INFO:tensorflow:loss = 87917.2, step = 2201 (0.878 sec)
INFO:tensorflow:loss = 98207.6, step = 2301 (0.758 sec)
INFO:tensorflow:loss = 105091.0, step = 2401 (0.795 sec)
INFO:tensorflow:loss = 89377.2, step = 2501 (0.813 sec)
INFO:tensorflow:loss = 82537.1, step = 2601 (0.810 sec)
INFO:tensorflow:loss = 88496.2, step = 2701 (0.787 sec)
INFO:tensorflow:loss = 91044.9, step = 2801 (0.861 sec)
INFO:tensorflow:loss = 93448.4, step = 2901 (0.788 sec)
INFO:tensorflow:loss = 84234.8, step = 3001 (0.796 sec)
INFO:tensorflow:loss = 88336.6, step = 3101 (0.834 sec)
INFO:tensorflow:loss = 90194.9, step = 3201 (0.843 sec)
INFO:tensorflow:loss = 93725.6, step = 3301 (0.773 sec)
INFO:tensorflow:loss = 88661.5, step = 3401 (0.781 sec)
INFO:tensorflow:loss = 92526.2, step = 3501 (0.807 sec)
INFO:tensorflow:loss = 85008.0, step = 3601 (0.780 sec)
INFO:tensorflow:loss = 89217.5, step = 3701 (0.793 sec)
INFO:tensorflow:loss = 83617.1, step = 3801 (0.817 sec)
INFO:tensorflow:loss = 94509.3, step = 3901 (0.767 sec)
INFO:tensorflow:loss = 85184.0, step = 4001 (0.755 sec)
INFO:tensorflow:loss = 78407.8, step = 4101 (0.765 sec)
INFO:tensorflow:loss = 84727.1, step = 4201 (0.761 sec)
INFO:tensorflow:loss = 96546.5, step = 4301 (0.805 sec)
INFO:tensorflow:loss = 92925.1, step = 4401 (0.785 sec)
INFO:tensorflow:loss = 84362.9, step = 4501 (0.760 sec)
INFO:tensorflow:loss = 97034.3, step = 4601 (0.772 sec)
INFO:tensorflow:loss = 83214.5, step = 4701 (0.763 sec)
INFO:tensorflow:loss = 99647.5, step = 4801 (0.773 sec)
INFO:tensorflow:loss = 82375.5, step = 4901 (0.768 sec)
INFO:tensorflow:global_step/sec: 129.897
INFO:tensorflow:loss = 86539.2, step = 5001 (0.785 sec)
INFO:tensorflow:loss = 86454.4, step = 5101 (0.824 sec)
INFO:tensorflow:loss = 84581.3, step = 5201 (0.830 sec)
INFO:tensorflow:loss = 85276.0, step = 5301 (0.847 sec)
INFO:tensorflow:loss = 83838.8, step = 5401 (0.815 sec)
INFO:tensorflow:loss = 86496.9, step = 5501 (0.807 sec)
INFO:tensorflow:loss = 95513.3, step = 5601 (0.792 sec)
INFO:tensorflow:loss = 91887.0, step = 5701 (0.779 sec)
INFO:tensorflow:loss = 93085.7, step = 5801 (0.789 sec)
INFO:tensorflow:loss = 77046.2, step = 5901 (0.805 sec)
INFO:tensorflow:loss = 92932.1, step = 6001 (0.770 sec)
INFO:tensorflow:loss = 96961.4, step = 6101 (0.777 sec)
INFO:tensorflow:loss = 82820.8, step = 6201 (0.774 sec)
INFO:tensorflow:loss = 87546.7, step = 6301 (0.772 sec)
INFO:tensorflow:loss = 95067.7, step = 6401 (0.753 sec)
INFO:tensorflow:loss = 93536.8, step = 6501 (0.795 sec)
INFO:tensorflow:Saving checkpoints for 6513 into trained_models/cenus-model-02/model.ckpt.
INFO:tensorflow:Loss for final step: 85792.7.

* data input_fn:
================
Input file(s): data/adult.data.csv
Batch size: 500
Epoch Count: 1
Mode: eval
Thread Count: 4
Shuffle: False
================

INFO:tensorflow:Starting evaluation at 2018-03-02-18:13:16
INFO:tensorflow:Restoring parameters from trained_models/cenus-model-02/model.ckpt-6513
INFO:tensorflow:Finished evaluation at 2018-03-02-18:13:18
INFO:tensorflow:Saving dict for global step 6513: accuracy = 0.771813, auroc = 0.840162, global_step = 6513, loss = 89778.5
INFO:tensorflow:Restoring parameters from trained_models/cenus-model-02/model.ckpt-6513
INFO:tensorflow:Assets added to graph.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: b"trained_models/cenus-model-02/export/predict/temp-b'1520014401'/saved_model.pb"
.......................................
Experiment finished at 18:13:22

Experiment elapsed time: 71.423351 seconds

Evaluate the Model


In [21]:
TRAIN_SIZE = TRAIN_DATA_SIZE
TEST_SIZE = TEST_DATA_SIZE

train_input_fn = lambda: csv_input_fn(files_name_pattern= TRAIN_DATA_FILES_PATTERN, 
                                      mode= tf.estimator.ModeKeys.EVAL,
                                      batch_size= TRAIN_SIZE)

test_input_fn = lambda: csv_input_fn(files_name_pattern= TEST_DATA_FILES_PATTERN, 
                                      mode= tf.estimator.ModeKeys.EVAL,
                                      batch_size= TEST_SIZE)

estimator = create_custom_estimator(run_config, hparams)

train_results = estimator.evaluate(input_fn=train_input_fn, steps=1)
print()
print("######################################################################################")
print("# Train Measures: {}".format(train_results))
print("######################################################################################")

test_results = estimator.evaluate(input_fn=test_input_fn, steps=1)
print()
print("######################################################################################")
print("# Test Measures: {}".format(test_results))
print("######################################################################################")


INFO:tensorflow:Using config: {'_model_dir': 'trained_models/cenus-model-02', '_tf_random_seed': 19830610, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 5000, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x125edce80>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}

* data input_fn:
================
Input file(s): data/adult.data.csv
Batch size: 32561
Epoch Count: None
Mode: eval
Thread Count: 4
Shuffle: False
================

INFO:tensorflow:Starting evaluation at 2018-03-02-18:13:24
INFO:tensorflow:Restoring parameters from trained_models/cenus-model-02/model.ckpt-6513
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Finished evaluation at 2018-03-02-18:13:26
INFO:tensorflow:Saving dict for global step 6513: accuracy = 0.771813, auroc = 0.840162, global_step = 6513, loss = 89734.4

######################################################################################
# Train Measures: {'accuracy': 0.77181292, 'auroc': 0.84016204, 'loss': 89734.406, 'global_step': 6513}
######################################################################################

* data input_fn:
================
Input file(s): data/adult.test.csv
Batch size: 16278
Epoch Count: None
Mode: eval
Thread Count: 4
Shuffle: False
================

INFO:tensorflow:Starting evaluation at 2018-03-02-18:13:27
INFO:tensorflow:Restoring parameters from trained_models/cenus-model-02/model.ckpt-6513
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Finished evaluation at 2018-03-02-18:13:29
INFO:tensorflow:Saving dict for global step 6513: accuracy = 0.775464, auroc = 0.843939, global_step = 6513, loss = 88939.9

######################################################################################
# Test Measures: {'accuracy': 0.77546382, 'auroc': 0.84393936, 'loss': 88939.867, 'global_step': 6513}
######################################################################################

Prediction


In [22]:
import itertools

predict_input_fn = lambda: csv_input_fn(TEST_DATA_FILES_PATTERN, 
                                      mode= tf.estimator.ModeKeys.PREDICT,
                                      batch_size= 10)


predictions = list(itertools.islice(estimator.predict(input_fn=predict_input_fn),10))

print("")

print("* Predicted Classes: {}".format(list(map(lambda item: item["class"]
    ,predictions))))

print("* Predicted Probabilities: {}".format(list(map(lambda item: list(item["probabilities"])
    ,predictions))))


* data input_fn:
================
Input file(s): data/adult.test.csv
Batch size: 10
Epoch Count: None
Mode: infer
Thread Count: 4
Shuffle: False
================

WARNING:tensorflow:Input graph does not contain a QueueRunner. That means predict yields forever. This is probably a mistake.
INFO:tensorflow:Restoring parameters from trained_models/cenus-model-02/model.ckpt-6513

* Predicted Classes: [b'<=50K', b'<=50K', b'<=50K', b'>50K', b'<=50K', b'<=50K', b'<=50K', b'>50K', b'<=50K', b'<=50K']
* Predicted Probabilities: [[0.86306363, 0.13693643], [0.86306363, 0.13693643], [0.86306363, 0.13693643], [0.48736197, 0.51263797], [0.86306363, 0.13693643], [0.86306363, 0.13693643], [0.86306363, 0.13693643], [0.48736197, 0.51263797], [0.86306363, 0.13693643], [0.86306363, 0.13693643]]

Serving Exported Model


In [23]:
import os

export_dir = model_dir +"/export/predict/"

saved_model_dir = export_dir + "/" + os.listdir(path=export_dir)[-1] 

print(saved_model_dir)
print("")

predictor_fn = tf.contrib.predictor.from_saved_model(
    export_dir = saved_model_dir,
    signature_def_key="prediction"
)

output = predictor_fn(
    {
        'age': [34.0],
        'workclass': ['Private'],
        'education': ['Doctorate'],
        'education_num': [10.0],
        'marital_status': ['Married-civ-spouse'],
        'occupation': ['Prof-specialty'],
        'relationship': ['Husband'],
        'race': ['White'],
        'gender': ['Male'],
        'capital_gain': [0.0], 
        'capital_loss': [0.0], 
        'hours_per_week': [40.0],
        'native_country':['Egyptian']
    }
)
print(output)


trained_models/cenus-model-02/export/predict//1520014401

INFO:tensorflow:Restoring parameters from b'trained_models/cenus-model-02/export/predict//1520014401/variables/variables'
{'class': array([b'>50K'], dtype=object), 'probabilities': array([[ 0.48736197,  0.51263797]], dtype=float32)}